#Air France Case Study Assignment
# First, we load all the necessary libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.2
library(qdap)
## Warning: package 'qdap' was built under R version 4.2.3
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 4.2.3
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 4.2.3
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:base':
##
## Filter, proportions
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:qdap':
##
## ngrams
##
## Attaching package: 'tm'
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
library(plotrix)
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.2.3
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
##
## id
## The following object is masked from 'package:qdapRegex':
##
## explain
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## The following object is masked from 'package:qdapRegex':
##
## %+%
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 0.3.5
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks qdapRegex::%+%()
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::explain() masks qdapRegex::explain()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::id() masks qdapTools::id()
## ✖ dplyr::lag() masks stats::lag()
# Next, we read in the dataset from an Excel file
AirFrance <- read_excel("C:/Users/User/Desktop/Air France Dataset.xlsx")
# We check for missing values in the dataset
# and replace blank cells with NA values
missing_vals <- sapply(AirFrance, function(x) sum(is.na(x)))
AirFrance[AirFrance==""] <- NA
# We create a new column for ROA as a KPI
# by dividing the "Amount" column by the "Total Cost" column
AirFrance <- within(AirFrance, ROA <- Amount / `Total Cost`)
# We identify the unique publisher names in the dataset
publisher_names <- unique(AirFrance$`Publisher Name`)
publisher_names
## [1] "Yahoo - US" "MSN - Global" "Google - Global"
## [4] "Overture - Global" "Google - US" "Overture - US"
## [7] "MSN - US"
# We tally the total sales for each publisher
sales_by_publisher <- c()
for (i in 1:length(publisher_names)) {
# We subset the dataset to include only rows with the current publisher name
publisher_sales <- AirFrance$Amount[which(AirFrance[, 2] == publisher_names[i])]
# We sum up the sales for the current publisher and add it to our sales vector
sales_by_publisher <- c(sales_by_publisher, sum(publisher_sales))
i <- i + 1
}
# We combine the publisher names and their corresponding sales figures into a single dataframe
publisher_sales_df <- cbind(publisher_names, as.numeric(sales_by_publisher))
publisher_sales_df
## publisher_names
## [1,] "Yahoo - US" "882288.95"
## [2,] "MSN - Global" "145524.25"
## [3,] "Google - Global" "929549.8"
## [4,] "Overture - Global" "430084.7"
## [5,] "Google - US" "1745481.8"
## [6,] "Overture - US" "347433.25"
## [7,] "MSN - US" "181549.8"
# We identify the publisher with the highest sales
max_sales <- max(sales_by_publisher)
top_publisher <- publisher_names[which(sales_by_publisher == max_sales)]
top_publisher
## [1] "Google - US"
#We can now explore Publisher Strategy i.e The Most Effective Channel
#We can start by creating a new dataframe for analysis
New_AirFrance <- AirFrance
# We calculate basic statistics for each variable of interest
Statistics <- c("Mean", "Median", "SD", "Min", "Max")
Amount_stats <- round(c(mean(New_AirFrance$Amount), median(New_AirFrance$Amount), sd(New_AirFrance$Amount), min(New_AirFrance$Amount), max(New_AirFrance$Amount)), 2)
# Calculate summary statistics for the Total Cost column
Total_Cost_stats <- round(c(mean(New_AirFrance$`Total Cost`), median(New_AirFrance$`Total Cost`), sd(New_AirFrance$`Total Cost`), min(New_AirFrance$`Total Cost`), max(New_AirFrance$`Total Cost`)), 2)
Impressions_stats <- round(c(mean(New_AirFrance$Impressions), median(New_AirFrance$Impressions), sd(New_AirFrance$Impressions), min(New_AirFrance$Impressions), max(New_AirFrance$Impressions)), 2)
Clicks_stats <- round(c(mean(New_AirFrance$Clicks), median(New_AirFrance$Clicks), sd(New_AirFrance$Clicks), min(New_AirFrance$Clicks), max(New_AirFrance$Clicks)), 2)
# We combine the statistics for each variable into a summary dataframe
Summary <- as.data.frame(cbind(Statistics, Amount_stats, Total_Cost_stats, Impressions_stats, Clicks_stats))
# We remove any rows with Total Cost = 0, as this will cause an infinite value when calculating ROA
New_AirFrance <- New_AirFrance[New_AirFrance$`Total Cost` != 0,]
# We can now create new columns for revenue, ROA, booking probability, cost per booking, and average revenue per booking
New_AirFrance$Revenue <- New_AirFrance$Amount - New_AirFrance$`Total Cost`
New_AirFrance$ROA <- round(New_AirFrance$Revenue / New_AirFrance$`Total Cost`, 2)
New_AirFrance$Book_Prob <- round((New_AirFrance$`Trans. Conv. %` * New_AirFrance$`Engine Click Thru %`) / 100, 2)
New_AirFrance$Cost_Book <- round(New_AirFrance$`Total Cost` / New_AirFrance$`Total Volume of Bookings`, 2)
New_AirFrance[New_AirFrance == ""] <- 0
New_AirFrance[New_AirFrance == "Inf"] <- 0
New_AirFrance$Average_Revenue_Booking <- round(New_AirFrance$Amount / New_AirFrance$`Total Volume of Bookings`, 2)
# We can check the summary statistics of our new dataframe
summary(New_AirFrance)
## Publisher ID Publisher Name Keyword ID Keyword
## Length:4509 Length:4509 Length:4509 Length:4509
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Match Type Campaign Keyword Group Category
## Length:4509 Length:4509 Length:4509 Length:4509
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Bid Strategy Keyword Type Status Search Engine Bid
## Length:4509 Length:4509 Length:4509 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 3.275
## Mode :character Mode :character Mode :character Median : 6.250
## Mean : 5.434
## 3rd Qu.: 6.250
## Max. :27.500
##
## Clicks Click Charges Avg. Cost per Click Impressions
## Min. : 1.0 Min. : 0.05 Min. : 0.0392 Min. : 1
## 1st Qu.: 1.0 1st Qu.: 2.31 1st Qu.: 0.8250 1st Qu.: 28
## Median : 4.0 Median : 6.76 Median : 1.6510 Median : 176
## Mean : 113.7 Mean : 167.51 Mean : 1.8907 Mean : 9286
## 3rd Qu.: 19.0 3rd Qu.: 28.50 3rd Qu.: 2.6625 3rd Qu.: 844
## Max. :34012.0 Max. :46188.44 Max. :10.0000 Max. :8342415
##
## Engine Click Thru % Avg. Pos. Trans. Conv. % Total Cost/ Trans.
## Min. : 0.01287 Min. : 0.000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 1.53489 1st Qu.: 1.143 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 4.10866 Median : 1.594 Median : 0.0000 Median : 0.00
## Mean : 11.14392 Mean : 1.930 Mean : 0.5694 Mean : 27.61
## 3rd Qu.: 10.91954 3rd Qu.: 2.308 3rd Qu.: 0.0000 3rd Qu.: 0.00
## Max. :200.00000 Max. :15.000 Max. :900.0000 Max. :9597.17
##
## Amount Total Cost Total Volume of Bookings
## Min. : 0 Min. : 0.05 Min. : 0.0000
## 1st Qu.: 0 1st Qu.: 2.31 1st Qu.: 0.0000
## Median : 0 Median : 6.76 Median : 0.0000
## Mean : 1034 Mean : 167.51 Mean : 0.8734
## 3rd Qu.: 0 3rd Qu.: 28.50 3rd Qu.: 0.0000
## Max. :567463 Max. :46188.44 Max. :439.0000
##
## ROA Revenue Book_Prob Cost_Book
## Min. : -1.000 Min. : -8725.9 Min. : 0.00000 Min. : 0.00
## 1st Qu.: -1.000 1st Qu.: -18.9 1st Qu.: 0.00000 1st Qu.: 0.00
## Median : -1.000 Median : -5.0 Median : 0.00000 Median : 0.00
## Mean : 3.415 Mean : 866.3 Mean : 0.06812 Mean : 27.61
## 3rd Qu.: -1.000 3rd Qu.: -1.6 3rd Qu.: 0.00000 3rd Qu.: 0.00
## Max. :3794.870 Max. :549524.1 Max. :81.82000 Max. :9597.17
##
## Average_Revenue_Booking
## Min. : 34.0
## 1st Qu.: 565.7
## Median : 900.1
## Mean :1025.9
## 3rd Qu.:1281.0
## Max. :5877.8
## NA's :4142
# Check data type of 'Match Type' column
typeof(New_AirFrance$`Match Type`)
## [1] "character"
# Check the first few rows of the new dataframe
head(New_AirFrance)
## # A tibble: 6 × 28
## `Publisher ID` Publi…¹ Keywo…² Keyword Match…³ Campa…⁴ Keywo…⁵ Categ…⁶ Bid S…⁷
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 K2615 Yahoo … 430000… fly to… Advanc… Wester… Floren… uncate… <NA>
## 2 K2615 Yahoo … 430000… low in… Advanc… Geo Ta… Low In… uncate… <NA>
## 3 K2003 MSN - … 430000… air di… Broad Air Fr… France uncate… Positi…
## 4 K1175 Google… 430000… [airfr… Exact Air Fr… Air Fr… airfra… Positi…
## 5 K1123 Overtu… 430000… air fr… Standa… Unassi… Unassi… airfra… Positi…
## 6 K1123 Overtu… 430000… airfra… Standa… Unassi… Unassi… airfra… Positi…
## # … with 19 more variables: `Keyword Type` <chr>, Status <chr>,
## # `Search Engine Bid` <dbl>, Clicks <dbl>, `Click Charges` <dbl>,
## # `Avg. Cost per Click` <dbl>, Impressions <dbl>,
## # `Engine Click Thru %` <dbl>, `Avg. Pos.` <dbl>, `Trans. Conv. %` <dbl>,
## # `Total Cost/ Trans.` <dbl>, Amount <dbl>, `Total Cost` <dbl>,
## # `Total Volume of Bookings` <dbl>, ROA <dbl>, Revenue <dbl>,
## # Book_Prob <dbl>, Cost_Book <dbl>, Average_Revenue_Booking <dbl>, and …
# We create Pivot Table to compare ROA and Avg. Cost per Click
AFdf_pivot <- New_AirFrance %>% group_by(`Publisher Name`) %>% summarize(
avg_ROA = mean(ROA),
avg_cpc = mean(`Avg. Cost per Click`)
)
summary(AFdf_pivot)
## Publisher Name avg_ROA avg_cpc
## Length:7 Min. : 1.181 Min. :0.7639
## Class :character 1st Qu.: 1.219 1st Qu.:1.4018
## Mode :character Median : 4.485 Median :2.1530
## Mean : 4.827 Mean :1.8854
## 3rd Qu.: 7.518 3rd Qu.:2.3050
## Max. :10.649 Max. :2.8675
#We Create the second Pivot Table For Bubble Chart to compare different SEM
AFdf_pivot2 <- New_AirFrance %>% group_by(`Publisher Name`) %>% summarize(
total_records = n(),
total_amount = sum(`Total Cost`),
avg_cpc = mean(`Avg. Cost per Click`),
avg_prob = mean(`Book_Prob`),
avg_ROA = mean(ROA)
)
summary(AFdf_pivot2)
## Publisher Name total_records total_amount avg_cpc
## Length:7 Min. : 98.0 Min. : 12160 Min. :0.7639
## Class :character 1st Qu.: 246.0 1st Qu.: 31148 1st Qu.:1.4018
## Mode :character Median : 553.0 Median : 64296 Median :2.1530
## Mean : 644.1 Mean :107902 Mean :1.8854
## 3rd Qu.: 648.0 3rd Qu.:131461 3rd Qu.:2.3050
## Max. :2070.0 Max. :353641 Max. :2.8675
## avg_prob avg_ROA
## Min. :0.00646 Min. : 1.181
## 1st Qu.:0.02718 1st Qu.: 1.219
## Median :0.05449 Median : 4.485
## Mean :0.06896 Mean : 4.827
## 3rd Qu.:0.10063 3rd Qu.: 7.518
## Max. :0.16614 Max. :10.649
#Bubble Chart
p <- plot_ly(AFdf_pivot2, x = ~avg_prob, y = ~avg_ROA,
textposition = "auto",
type = 'scatter',
mode = 'markers',
size = ~avg_cpc,
color = ~`Publisher Name`,
colors = 'Paired',
marker = list(opacity = 0.8, sizemode = 'diameter')) %>%
layout(title = 'Publisher Strategy',
xaxis = list(title = "Probability of Booking", showgrid = TRUE),
yaxis = list(title = "Average ROA", showgrid = TRUE),
showlegend = TRUE)
p